Background¶
The 7Ps of the Marketing Mix are product, people, price, processes, promotion, place, and physical evidence. For this project we will dive into the "People" and "Promotion" sides of the marketing strategy: running an effective marketing campaign relies on understanding our potential customers and on communicating the product offer effectively.
Goal¶
- Identify the main features that affect whether potential customers convert and open a term deposit account.
- Create a successful marketing strategy based on the trends found in the data.
Dataset¶
No. of features: 16. No. of instances: 45211. The data is related to the direct marketing campaigns of a Portuguese banking institution. The campaigns were based on phone calls; often, more than one contact with the same client was required in order to assess whether the product (a bank term deposit) would be subscribed ('yes') or not ('no'). Click here for additional variable information.
Credit: Moro S, Rita P, Cortez P. Bank Marketing [dataset]. 2014. UCI Machine Learning Repository. Available from: https://doi.org/10.24432/C5K306.
Libraries¶
Models:
- from category_encoders import OneHotEncoder
- from sklearn.linear_model import LogisticRegression
- from sklearn.tree import DecisionTreeClassifier, plot_tree
- from category_encoders import OrdinalEncoder
- from sklearn.pipeline import Pipeline, make_pipeline
- from sklearn.metrics import accuracy_score
- from sklearn.model_selection import train_test_split
Data handling and visualization:
- import pandas as pd
- import matplotlib.pyplot as plt
- import seaborn as sns
- import numpy as np
Data Wrangling¶
Step 1: Understanding the dataset¶
- Import the libraries and the dataset.
- View the first rows of the dataset.
- Check for any null values and review the data types.
- Look at the categorical variables.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from category_encoders import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
df = pd.read_csv("dataset/bank-full.csv")
print(df.shape)
df.head()
(45211, 17)
 | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 58 | management | married | tertiary | no | 2143 | yes | no | unknown | 5 | may | 261 | 1 | -1 | 0 | unknown | no |
1 | 44 | technician | single | secondary | no | 29 | yes | no | unknown | 5 | may | 151 | 1 | -1 | 0 | unknown | no |
2 | 33 | entrepreneur | married | secondary | no | 2 | yes | yes | unknown | 5 | may | 76 | 1 | -1 | 0 | unknown | no |
3 | 47 | blue-collar | married | unknown | no | 1506 | yes | no | unknown | 5 | may | 92 | 1 | -1 | 0 | unknown | no |
4 | 33 | unknown | single | unknown | no | 1 | no | no | unknown | 5 | may | 198 | 1 | -1 | 0 | unknown | no |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   age        45211 non-null  int64
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64
 12  campaign   45211 non-null  int64
 13  pdays      45211 non-null  int64
 14  previous   45211 non-null  int64
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
df.nunique()
age            77
job            12
marital         3
education       4
default         2
balance      7168
housing         2
loan            2
contact         3
day            31
month          12
duration     1573
campaign       48
pdays         559
previous       41
poutcome        4
y               2
dtype: int64
df["job"].value_counts(normalize=True).sort_values(ascending=True).plot(kind="barh")
plt.title("Job Type of Customers");
plt.hist(df["age"],bins=20)
plt.xlabel("Age")
plt.title("Age Distribution of Customers");
#Convert duration (seconds) to whole minutes
df["minutes"] = (df["duration"]/60).astype(int)
df["minutes"].describe()
count    45211.00000
mean         3.81739
std          4.29427
min          0.00000
25%          1.00000
50%          3.00000
75%          5.00000
max         81.00000
Name: minutes, dtype: float64
#Convert y to a binary target: 1 for yes, 0 for no
df["term_deposit"] = (df["y"]
.str.replace("yes","1",regex=False)
.str.replace("no","0",regex=False)
).astype(int)
Step 2: Look for outliers¶
- Are there over- or under-represented values?
- Does the column provide additional information?
- Can the column be disregarded (e.g., due to leakage)?
df["pdays"].describe()
count    45211.000000
mean        40.197828
std        100.128746
min         -1.000000
25%         -1.000000
50%         -1.000000
75%         -1.000000
max        871.000000
Name: pdays, dtype: float64
df["pdays"].value_counts(normalize=True).head()*100
pdays
-1     81.736745
 182    0.369379
 92     0.325142
 183    0.278693
 91     0.278693
Name: proportion, dtype: float64
df.sort_values(by="pdays",ascending=False).head(25)
 | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
45146 | 49 | unemployed | divorced | tertiary | no | 780 | no | no | cellular | 8 | nov | 148 | 1 | 871 | 2 | failure | no |
44829 | 37 | management | divorced | tertiary | no | 488 | yes | no | cellular | 17 | sep | 328 | 1 | 854 | 2 | failure | yes |
44837 | 35 | management | single | tertiary | no | 151 | no | no | unknown | 20 | sep | 11 | 1 | 850 | 2 | failure | no |
44858 | 31 | housemaid | married | secondary | no | 243 | yes | no | cellular | 23 | sep | 305 | 2 | 842 | 1 | failure | yes |
44785 | 43 | blue-collar | married | secondary | no | 408 | yes | no | unknown | 14 | sep | 6 | 1 | 838 | 3 | other | no |
44698 | 34 | technician | married | secondary | no | 384 | yes | no | cellular | 6 | sep | 127 | 2 | 831 | 1 | other | no |
44530 | 34 | blue-collar | married | secondary | no | 320 | yes | no | cellular | 12 | aug | 352 | 1 | 828 | 2 | failure | yes |
45024 | 47 | admin. | married | secondary | no | 1387 | yes | no | cellular | 14 | oct | 158 | 1 | 826 | 1 | failure | no |
44924 | 35 | blue-collar | married | secondary | no | 137 | no | yes | unknown | 4 | oct | 5 | 1 | 808 | 12 | failure | no |
45120 | 32 | technician | married | secondary | no | 1547 | no | no | cellular | 26 | oct | 289 | 1 | 805 | 4 | other | yes |
45037 | 45 | management | single | tertiary | no | 2048 | yes | no | cellular | 18 | oct | 310 | 1 | 804 | 1 | failure | yes |
44260 | 41 | blue-collar | divorced | secondary | no | 663 | yes | no | unknown | 22 | jul | 24 | 1 | 792 | 3 | other | no |
44815 | 60 | retired | married | secondary | no | 975 | no | no | cellular | 16 | sep | 303 | 1 | 792 | 1 | failure | yes |
44243 | 41 | blue-collar | married | primary | no | 178 | yes | no | unknown | 20 | jul | 5 | 1 | 791 | 1 | failure | no |
44287 | 37 | technician | married | secondary | no | 1707 | yes | no | cellular | 26 | jul | 546 | 2 | 784 | 3 | failure | yes |
44489 | 31 | blue-collar | married | secondary | no | 0 | yes | no | unknown | 10 | aug | 97 | 1 | 782 | 1 | other | yes |
44864 | 46 | management | married | tertiary | no | 7485 | no | no | cellular | 23 | sep | 145 | 1 | 779 | 2 | failure | no |
44832 | 28 | admin. | married | secondary | no | 242 | yes | no | unknown | 17 | sep | 47 | 1 | 779 | 12 | failure | no |
44822 | 27 | blue-collar | married | secondary | no | 821 | yes | yes | unknown | 16 | sep | 23 | 1 | 778 | 41 | other | no |
44089 | 37 | technician | married | secondary | no | 432 | yes | no | cellular | 6 | jul | 386 | 3 | 776 | 55 | failure | yes |
44604 | 30 | blue-collar | married | primary | no | 124 | yes | no | unknown | 28 | aug | 5 | 1 | 775 | 2 | other | no |
44974 | 39 | management | married | tertiary | no | 839 | no | yes | cellular | 11 | oct | 365 | 2 | 774 | 11 | failure | no |
44965 | 36 | management | single | tertiary | no | 335 | no | no | unknown | 10 | oct | 5 | 1 | 772 | 4 | failure | no |
44840 | 35 | management | single | tertiary | no | 1120 | no | no | unknown | 21 | sep | 4 | 1 | 771 | 2 | success | no |
44798 | 38 | management | married | tertiary | no | 1477 | no | no | cellular | 15 | sep | 385 | 3 | 769 | 2 | failure | yes |
df["campaign"].describe()
count    45211.000000
mean         2.763841
std          3.098021
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         63.000000
Name: campaign, dtype: float64
df["campaign"].value_counts(normalize=True).head(10)*100
campaign
1     38.804716
2     27.659198
3     12.211630
4      7.790140
5      3.901705
6      2.855500
7      1.625711
8      1.194400
9      0.723275
10     0.588352
Name: proportion, dtype: float64
df["balance"].describe()
count     45211.000000
mean       1362.272058
std        3044.765829
min       -8019.000000
25%          72.000000
50%         448.000000
75%        1428.000000
max      102127.000000
Name: balance, dtype: float64
df["previous"].describe()
count    45211.000000
mean         0.580323
std          2.303441
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max        275.000000
Name: previous, dtype: float64
df["previous"].value_counts(normalize=True).head(5)*100
previous
0    81.736745
1     6.131251
2     4.658158
3     2.525934
4     1.579262
Name: proportion, dtype: float64
df["poutcome"].value_counts(normalize=True)*100
poutcome
unknown    81.747805
failure    10.840282
other       4.069806
success     3.342107
Name: proportion, dtype: float64
#Recode previous outcome (poutcome) values of unknown and other as failure
df["previous_outcome"] = (df["poutcome"]
.str.replace("unknown","failure",regex=False)
.str.replace("other","failure",regex=False)
)
Step 3: Trends in the campaign¶
- Did the month and the week of the campaign matter?
- Does the length of the call impact the success rate?
- Does the outcome of the previous campaign correlate with the current one?
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
month_order = {
"jan":1,
"feb":2,
"mar":3,
"apr":4,
"may":5,
"jun":6,
"jul":7,
"aug":8,
"sep":9,
"oct":10,
"nov":11,
"dec":12
}
type(month_order)
dict
calls_per_month = (
df["month"]
.replace(month_order)
.groupby(df["y"])
.value_counts(normalize=True)
.rename("Frequency")
.to_frame()
.reset_index()
)
#side by side bar chart for successful conversion
sns.barplot(
x="month",
y="Frequency",
hue="y",
data=calls_per_month,
order=month_order.values()
)
plt.xlabel("Month")
plt.ylabel("Frequency (%)")
plt.legend(title='Opened a Term Deposit?')
plt.title("Conversion Rate: Calls Per Month");
days_to_week = {
range(1,8): "1st Week",
range(8,15): "2nd Week",
range(15,22): "3rd Week",
range(22,32): "4th Week"
}
days_to_week.keys()
dict_keys([range(1, 8), range(8, 15), range(15, 22), range(22, 32)])
calls_per_week = (
df["day"]
.map(lambda d: next(week for days, week in days_to_week.items() if d in days)) #map each day of month to its week label via membership in the range keys
.groupby(df["y"])
.value_counts(normalize=True)
.rename("Frequency")
.to_frame()
.reset_index()
.sort_values(by="day")
)
#side by side bar chart for successful conversion
sns.barplot(
x="day",
y="Frequency",
hue="y",
data=calls_per_week,
)
plt.xlabel("Month")
plt.ylabel("Frequency (%)")
plt.legend(title='Opened a Term Deposit?')
plt.title("Conversion Rate: Calls Per Week");
contact = (
df["contact"]
.groupby(df["y"])
.value_counts(normalize=True)
.rename("Frequency")
.to_frame()
.reset_index()
)
#side by side bar chart for successful conversion
sns.barplot(
x="contact",
y="Frequency",
hue="y",
data=contact,
)
plt.xlabel("Communication by")
plt.ylabel("Frequency (%)")
plt.legend(title='Opened a Term Deposit?')
plt.title("Conversion Rate: Channel Used");
multi_correlation = df.select_dtypes("number").drop(columns="duration").corr()
#Plot heatmap of correlation
sns.heatmap(multi_correlation);
#Create boxplot
sns.boxplot(x="term_deposit",y="minutes",data=df);
plt.xlabel("Opened a Term Deposit (N/Y)")
plt.ylabel("No. of Minutes")
plt.title("Distribution of Length of Calls by Class");
#Create boxplot
sns.boxplot(x="term_deposit",y="campaign",data=df);
plt.xlabel("Opened a Term Deposit (N/Y)")
plt.ylabel("Campaign Count")
plt.title("Distribution of Outreach Made by Class");
def wrangle(filepath):
# Read the CSV file into a DataFrame
df = pd.read_csv(filepath)
#Identify leakage and multicollinearity columns
#and high-/low-cardinality features to drop
drop_cols = ["pdays","previous","balance","day","month","age"]
#Drop columns
df.drop(columns=drop_cols,inplace=True)
#Convert duration (in secs) to minutes
df["minutes"] = (df["duration"]/60).astype(int)
df.drop(columns = ["duration"],inplace=True)
#Tag poutcome of unknown and other as failure then drops the orig column
df["previous_outcome"] = (df["poutcome"]
.str.replace("unknown","failure",regex=False)
.str.replace("other","failure",regex=False))
df.drop(columns = ["poutcome"],inplace=True)
#Create binary target then drops the original column
df["term_deposit"] = (df["y"]
.str.replace("yes","1",regex=False)
.str.replace("no","0",regex=False)
).astype(int)
df.drop(columns = ["y"],inplace=True)
return df
df_train = wrangle("dataset/bank-full.csv")
print(df_train.shape)
df_train.head(3)
(45211, 11)
 | job | marital | education | default | housing | loan | contact | campaign | minutes | previous_outcome | term_deposit |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | management | married | tertiary | no | yes | no | unknown | 1 | 4 | failure | 0 |
1 | technician | single | secondary | no | yes | no | unknown | 1 | 2 | failure | 0 |
2 | entrepreneur | married | secondary | no | yes | yes | unknown | 1 | 1 | failure | 0 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   age               45211 non-null  int64
 1   job               45211 non-null  object
 2   marital           45211 non-null  object
 3   education         45211 non-null  object
 4   default           45211 non-null  object
 5   balance           45211 non-null  int64
 6   housing           45211 non-null  object
 7   loan              45211 non-null  object
 8   contact           45211 non-null  object
 9   day               45211 non-null  int64
 10  month             45211 non-null  object
 11  duration          45211 non-null  int64
 12  campaign          45211 non-null  int64
 13  pdays             45211 non-null  int64
 14  previous          45211 non-null  int64
 15  poutcome          45211 non-null  object
 16  y                 45211 non-null  object
 17  minutes           45211 non-null  int64
 18  term_deposit      45211 non-null  int64
 19  previous_outcome  45211 non-null  object
dtypes: int64(9), object(11)
memory usage: 6.9+ MB
Step 4: Model Pipeline of Logistic Regression¶
- Assign the target to term_deposit and split the data.
- Create a pipeline model and fit it to the training set.
- Generate accuracy scores for the split test set and the randomized set.
#Split then create baseline
target="term_deposit"
X = df_train.drop(columns=[target])
y = df_train[target]
X_train, X_test, y_train, y_test = train_test_split(
X,y,test_size=0.2,random_state=42
)
acc_baseline = y_train.value_counts(normalize=True).max()
print("Baseline Accuracy:", round(acc_baseline, 4))
#Create a pipeline model for iteration
model_lr = make_pipeline(
OneHotEncoder(use_cat_names=True),
LogisticRegression(max_iter=3000)
)
model_lr.fit(X_train,y_train)
#Accuracy score
acc_train = accuracy_score(y_train,model_lr.predict(X_train))
acc_test = model_lr.score(X_test,y_test)
print("Logistic Regression Training Accuracy:", round(acc_train,4))
print("Logistic Regression Testing Accuracy:", round(acc_test,4))
Baseline Accuracy: 0.8839
Logistic Regression Training Accuracy: 0.9008
Logistic Regression Testing Accuracy: 0.8995
#Model predict
model_lr.predict(X_train)[:5]
array([0, 0, 0, 0, 0])
#Inspect the predicted probabilities behind predict
model_lr.predict_proba(X_train)[:5]
array([[0.99306942, 0.00693058],
       [0.9501959 , 0.0498041 ],
       [0.92044168, 0.07955832],
       [0.98240072, 0.01759928],
       [0.9102008 , 0.0897992 ]])
#Feature importance
features=model_lr.named_steps["onehotencoder"].get_feature_names_out()
importances=model_lr.named_steps["logisticregression"].coef_[0]
#Create odds ratios by exponentiating the coefficients
odds_ratios = pd.Series(np.exp(importances),index=features).sort_values()
odds_ratios.head(5)
previous_outcome_failure    0.212728
contact_unknown             0.345706
housing_yes                 0.508084
loan_yes                    0.550597
default_yes                 0.653807
dtype: float64
odds_ratios.tail(5)
contact_cellular            1.261042
minutes                     1.273600
job_retired                 1.632096
job_student                 1.805086
previous_outcome_success    2.548314
dtype: float64
#bank.csv with 10% of the examples (4521), randomly selected from bank-full.csv.
df_test = wrangle("dataset/bank.csv")
X_random = df_test.drop(columns=[target])
y_random = df_test[target]
#Accuracy score
val_test = model_lr.score(X_random,y_random)
print("Logistic Regression Random Test Accuracy:", round(val_test,4))
Logistic Regression Random Test Accuracy: 0.9013
Step 5: Model Pipeline of Decision Tree Classifier¶
- Split validation data off from the training set.
- Create a pipeline model and fit it to the training set.
- Generate accuracy scores for the split test set and the validation set.
- Get tree depth and tune the hyperparameters.
- Plot the validation curve.
- Create a new model with the new tree depth.
from sklearn.tree import DecisionTreeClassifier, plot_tree #Predictor
from category_encoders import OrdinalEncoder #Replacing onehotencoder
#Split validation data from train data
X_train,X_val,y_train,y_val = train_test_split(
X_train,y_train,test_size=0.2, random_state=42
)
model_dt = make_pipeline(
OrdinalEncoder(),
DecisionTreeClassifier(random_state=42)
)
model_dt.fit(X_train,y_train)
dt_acc_train = accuracy_score(y_train,model_dt.predict(X_train))
dt_acc_val = model_dt.score(X_val,y_val)
dt_acc_test = model_dt.score(X_test,y_test)
print("Baseline Accuracy:", round(acc_baseline, 4))
print("Training Accuracy:", round(dt_acc_train, 4))
print("Validation Accuracy:", round(dt_acc_val, 4))
print("Test Accuracy:", round(dt_acc_test, 4))
Baseline Accuracy: 0.8839
Training Accuracy: 0.9651
Validation Accuracy: 0.8777
Test Accuracy: 0.8709
model_dt.predict(X_train)[:50]
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0])
tree_depth = model_dt.named_steps["decisiontreeclassifier"].get_depth()
print("Tree Depth/Vines/Divisions:", tree_depth)
Tree Depth/Vines/Divisions: 24
#Hyperparameter tuning for the decision tree
depth_hyperparams = range(1,30,2)
training_acc = []
validation_acc = []
random_acc = []
test_acc = []
for d in depth_hyperparams:
test_model = make_pipeline(
OrdinalEncoder(),
DecisionTreeClassifier(max_depth=d,random_state=42)
)
test_model.fit(X_train,y_train)
training_acc.append(test_model.score(X_train,y_train))
validation_acc.append(test_model.score(X_val,y_val))
random_acc.append(test_model.score(X_random,y_random))
test_acc.append(test_model.score(X_test,y_test))
print("Training Accuracy Scores:", training_acc[:3])
print("Validation Accuracy Scores:", validation_acc[:3])
print("Randomized Data Accuracy Scores:", random_acc[:3])
print("Test Accuracy Scores:", test_acc[:3])
Training Accuracy Scores: [0.8842908256261393, 0.9056909471410248, 0.9058934719503139]
Validation Accuracy Scores: [0.8879589632829373, 0.8982181425485961, 0.8971382289416847]
Randomized Data Accuracy Scores: [0.8847600088476001, 0.8995797389957974, 0.9000221190002212]
Test Accuracy Scores: [0.8793541966161672, 0.8968262744664381, 0.8962733606104168]
#Plot the validation curve to see where training and validation accuracy diverge
plt.plot(depth_hyperparams,training_acc,label="training")
plt.plot(depth_hyperparams,validation_acc,label="validation")
plt.plot(depth_hyperparams,random_acc,label="randomized data")
plt.plot(depth_hyperparams,test_acc,label="test data")
plt.xlabel("Max Depth")
plt.ylabel("Accuracy Score")
plt.xticks(np.arange(0,31,2))
plt.yticks(np.arange(0.85,1,0.01))
plt.grid()
plt.legend();
#Create new model with tuned depth
final_dt_model = make_pipeline(
OrdinalEncoder(),
DecisionTreeClassifier(max_depth=8,random_state=42)
)
final_dt_model.fit(X_train,y_train)
test_acc = final_dt_model.score(X_test,y_test)
print("Test Accuracy:", round(test_acc, 4))
Test Accuracy: 0.8944
#Create a simpler model with a smaller max_depth for comparison
final_dt_model = make_pipeline(
OrdinalEncoder(),
DecisionTreeClassifier(max_depth=3,random_state=42)
)
final_dt_model.fit(X_train,y_train)
test_acc = final_dt_model.score(X_test,y_test)
print("Test Accuracy:", round(test_acc, 4))
Test Accuracy: 0.8968
Results¶
Wrangle Function¶
def wrangle(filepath):
# Read the CSV file into a DataFrame
df = pd.read_csv(filepath)
#Identify leakage and multicollinearity columns
#and high-/low-cardinality features to drop
drop_cols = ["pdays","previous","balance","day","month","age"]
#Drop columns
df.drop(columns=drop_cols,inplace=True)
#Convert duration (in secs) to minutes
df["minutes"] = (df["duration"]/60).astype(int)
df.drop(columns = ["duration"],inplace=True)
#Tag poutcome of unknown and other as failure then drops the orig column
df["previous_outcome"] = (df["poutcome"]
.str.replace("unknown","failure",regex=False)
.str.replace("other","failure",regex=False))
df.drop(columns = ["poutcome"],inplace=True)
#Create binary target then drops the original column
df["term_deposit"] = (df["y"]
.str.replace("yes","1",regex=False)
.str.replace("no","0",regex=False)
).astype(int)
df.drop(columns = ["y"],inplace=True)
return df
Comparing Accuracy Scores¶
Logistic Regression Scores¶
print("Baseline Accuracy:", round(acc_baseline, 4))
print("Logistic Regression Training Dataset Accuracy:", round(acc_train,4))
print("Logistic Regression Randomized Dataset Accuracy:", round(val_test,4))
print("Logistic Regression Testing Dataset Accuracy:", round(acc_test,4))
Baseline Accuracy: 0.8839
Logistic Regression Training Dataset Accuracy: 0.9008
Logistic Regression Randomized Dataset Accuracy: 0.9013
Logistic Regression Testing Dataset Accuracy: 0.8995
Decision Tree Model Score¶
print("Baseline Accuracy:", round(acc_baseline, 4))
print("Decision Tree Training Dataset Accuracy:", round(final_dt_model.score(X_train,y_train), 4))
print("Decision Tree Validation Dataset Accuracy:", round(final_dt_model.score(X_val,y_val), 4))
print("Decision Tree Testing Datasest Accuracy:", round(test_acc, 4))
Baseline Accuracy: 0.8839
Decision Tree Training Dataset Accuracy: 0.9112
Decision Tree Validation Dataset Accuracy: 0.8977
Decision Tree Testing Dataset Accuracy: 0.8944
Findings¶
Feature Importances¶
odds_ratios.tail(5).plot(kind="barh") #five largest odds ratios
plt.xlabel("Odds Ratio")
plt.title("Top 5 Features Increasing the Odds of Conversion");
odds_ratios.head(5).plot(kind="barh") #five smallest odds ratios
plt.xlabel("Odds Ratio")
plt.title("Top 5 Features Decreasing the Odds of Conversion");
Gini Impurity and Importance¶
# Create larger figure
fig, ax = plt.subplots(figsize=(25, 12))
# Plot decision tree
plot_tree(
decision_tree= final_dt_model.named_steps["decisiontreeclassifier"],
feature_names=X_train.columns.to_list() ,
filled=True, # Color leaf with class
rounded=True, # Round leaf edges
proportion=True, # Display proportion of classes in leaf
max_depth=2, # Only display first 3 levels
fontsize=12, # Enlarge font
ax=ax, # Place in figure axis
)
plt.title("Decision Tree - Gini Impurity");
features_dt = X_train.columns
importances_dt = final_dt_model.named_steps["decisiontreeclassifier"].feature_importances_
#print("Features:",features_dt[0:3])
#print("Importances:",importances_dt[0:3])
#Transfer features and importances to dataframe
feat_imp_dt = pd.Series(importances_dt,index=features_dt).sort_values().tail(5)
#Interpreting feature and importance
fig, ax = plt.subplots(figsize=(5, 5))
feat_imp_dt.plot(kind="barh", ax=ax)
plt.xlabel("Gini Importance")
plt.xticks(np.arange(0,0.6,0.05))
plt.grid()
plt.ylabel("Feature");
Recommendation¶